{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 1. Import libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 2. Load the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6</td>\n",
       "      <td>148</td>\n",
       "      <td>72</td>\n",
       "      <td>35</td>\n",
       "      <td>0</td>\n",
       "      <td>33.6</td>\n",
       "      <td>0.627</td>\n",
       "      <td>50</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>85</td>\n",
       "      <td>66</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "      <td>26.6</td>\n",
       "      <td>0.351</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>183</td>\n",
       "      <td>64</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>23.3</td>\n",
       "      <td>0.672</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>89</td>\n",
       "      <td>66</td>\n",
       "      <td>23</td>\n",
       "      <td>94</td>\n",
       "      <td>28.1</td>\n",
       "      <td>0.167</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>137</td>\n",
       "      <td>40</td>\n",
       "      <td>35</td>\n",
       "      <td>168</td>\n",
       "      <td>43.1</td>\n",
       "      <td>2.288</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "0          6                           148              72   \n",
       "1          1                            85              66   \n",
       "2          8                           183              64   \n",
       "3          1                            89              66   \n",
       "4          0                           137              40   \n",
       "\n",
       "   Triceps_skin_fold_thickness  serum_insulin   BMI  \\\n",
       "0                           35              0  33.6   \n",
       "1                           29              0  26.6   \n",
       "2                            0              0  23.3   \n",
       "3                           23             94  28.1   \n",
       "4                           35            168  43.1   \n",
       "\n",
       "   Diabetes_pedigree_function  Age  Target  \n",
       "0                       0.627   50       1  \n",
       "1                       0.351   31       0  \n",
       "2                       0.672   32       1  \n",
       "3                       0.167   21       0  \n",
       "4                       2.288   33       1  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the dataset from a CSV path relative to the working directory, then preview the first rows.\n",
    "train = pd.read_csv(\"pima-indians-diabetes.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 768 entries, 0 to 767\n",
      "Data columns (total 9 columns):\n",
      "pregnants                       768 non-null int64\n",
      "Plasma_glucose_concentration    768 non-null int64\n",
      "blood_pressure                  768 non-null int64\n",
      "Triceps_skin_fold_thickness     768 non-null int64\n",
      "serum_insulin                   768 non-null int64\n",
      "BMI                             768 non-null float64\n",
      "Diabetes_pedigree_function      768 non-null float64\n",
      "Age                             768 non-null int64\n",
      "Target                          768 non-null int64\n",
      "dtypes: float64(2), int64(7)\n",
      "memory usage: 54.1 KB\n"
     ]
    }
   ],
   "source": [
    "# Report each column's dtype and non-null count.\n",
    "train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>3.845052</td>\n",
       "      <td>120.894531</td>\n",
       "      <td>69.105469</td>\n",
       "      <td>20.536458</td>\n",
       "      <td>79.799479</td>\n",
       "      <td>31.992578</td>\n",
       "      <td>0.471876</td>\n",
       "      <td>33.240885</td>\n",
       "      <td>0.348958</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>3.369578</td>\n",
       "      <td>31.972618</td>\n",
       "      <td>19.355807</td>\n",
       "      <td>15.952218</td>\n",
       "      <td>115.244002</td>\n",
       "      <td>7.884160</td>\n",
       "      <td>0.331329</td>\n",
       "      <td>11.760232</td>\n",
       "      <td>0.476951</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.078000</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>99.000000</td>\n",
       "      <td>62.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>27.300000</td>\n",
       "      <td>0.243750</td>\n",
       "      <td>24.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>3.000000</td>\n",
       "      <td>117.000000</td>\n",
       "      <td>72.000000</td>\n",
       "      <td>23.000000</td>\n",
       "      <td>30.500000</td>\n",
       "      <td>32.000000</td>\n",
       "      <td>0.372500</td>\n",
       "      <td>29.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>6.000000</td>\n",
       "      <td>140.250000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>32.000000</td>\n",
       "      <td>127.250000</td>\n",
       "      <td>36.600000</td>\n",
       "      <td>0.626250</td>\n",
       "      <td>41.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>17.000000</td>\n",
       "      <td>199.000000</td>\n",
       "      <td>122.000000</td>\n",
       "      <td>99.000000</td>\n",
       "      <td>846.000000</td>\n",
       "      <td>67.100000</td>\n",
       "      <td>2.420000</td>\n",
       "      <td>81.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "count  768.000000                    768.000000      768.000000   \n",
       "mean     3.845052                    120.894531       69.105469   \n",
       "std      3.369578                     31.972618       19.355807   \n",
       "min      0.000000                      0.000000        0.000000   \n",
       "25%      1.000000                     99.000000       62.000000   \n",
       "50%      3.000000                    117.000000       72.000000   \n",
       "75%      6.000000                    140.250000       80.000000   \n",
       "max     17.000000                    199.000000      122.000000   \n",
       "\n",
       "       Triceps_skin_fold_thickness  serum_insulin         BMI  \\\n",
       "count                   768.000000     768.000000  768.000000   \n",
       "mean                     20.536458      79.799479   31.992578   \n",
       "std                      15.952218     115.244002    7.884160   \n",
       "min                       0.000000       0.000000    0.000000   \n",
       "25%                       0.000000       0.000000   27.300000   \n",
       "50%                      23.000000      30.500000   32.000000   \n",
       "75%                      32.000000     127.250000   36.600000   \n",
       "max                      99.000000     846.000000   67.100000   \n",
       "\n",
       "       Diabetes_pedigree_function         Age      Target  \n",
       "count                  768.000000  768.000000  768.000000  \n",
       "mean                     0.471876   33.240885    0.348958  \n",
       "std                      0.331329   11.760232    0.476951  \n",
       "min                      0.078000   21.000000    0.000000  \n",
       "25%                      0.243750   24.000000    0.000000  \n",
       "50%                      0.372500   29.000000    0.000000  \n",
       "75%                      0.626250   41.000000    1.000000  \n",
       "max                      2.420000   81.000000    1.000000  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.\n",
    "train.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pregnants                         0\n",
      "Plasma_glucose_concentration      5\n",
      "blood_pressure                   35\n",
      "Triceps_skin_fold_thickness     227\n",
      "serum_insulin                   374\n",
      "BMI                              11\n",
      "Diabetes_pedigree_function        0\n",
      "Age                               0\n",
      "Target                            0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Columns in which a 0 is treated as a missing measurement rather than a real value\n",
    "# (presumably because a zero reading is not plausible for these features -- confirm against the data source).\n",
    "NaN_col_names = ['Plasma_glucose_concentration','blood_pressure','Triceps_skin_fold_thickness','serum_insulin','BMI']\n",
    "# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.\n",
    "train[NaN_col_names] = train[NaN_col_names].replace(0, np.nan)\n",
    "# Count the missing values per column after the replacement.\n",
    "print(train.isnull().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 3. Missing-value replacement"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>Triceps_skin_fold_thickness_Missing</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>29.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>23.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>32.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>45.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Triceps_skin_fold_thickness  Triceps_skin_fold_thickness_Missing\n",
       "0                         35.0                                    0\n",
       "1                         29.0                                    0\n",
       "2                          NaN                                    1\n",
       "3                         23.0                                    0\n",
       "4                         35.0                                    0\n",
       "5                          NaN                                    1\n",
       "6                         32.0                                    0\n",
       "7                          NaN                                    1\n",
       "8                         45.0                                    0\n",
       "9                          NaN                                    1"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Indicator column: 1 where the skin-fold measurement is missing, 0 otherwise.\n",
    "# isnull() is vectorized, so no per-row Python lambda is needed; int64 keeps the same integer dtype.\n",
    "train['Triceps_skin_fold_thickness_Missing'] = train['Triceps_skin_fold_thickness'].isnull().astype('int64')\n",
    "# Show the measurement next to its indicator for the first 10 rows as a sanity check.\n",
    "train[['Triceps_skin_fold_thickness','Triceps_skin_fold_thickness_Missing']].head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x11302ce50>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAELCAYAAADDZxFQAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAGk1JREFUeJzt3X+UVXW9//HnS34IJorA6OUyIOSvLqQOOaKVeU3tC9HXxH544Vv5O7xrkTdvZVmrpcjNtaprYeXNwvyBZajXStHrj/wZ1c0QEBEwl5glw1VBVK6GqEzv7x/7M3KcPjOcgdlzDszrsdZZc/Znf/be73MGzms+e++ztyICMzOz9napdQFmZlafHBBmZpblgDAzsywHhJmZZTkgzMwsywFhZmZZDggzM8tyQJiZWZYDwszMsvrWuoDtMWzYsBg9enStyzAz26EsXrz4+Yho2Fq/HTogRo8ezaJFi2pdhpnZDkXSn6vp511MZmaW5YAwM7MsB4SZmWXt0McgzMxq5Y033qClpYVNmzbVupQODRgwgMbGRvr167dNyzsgzMy2QUtLC4MGDWL06NFIqnU5fyMiWL9+PS0tLYwZM2ab1uFdTGZm22DTpk0MHTq0LsMBQBJDhw7drhGOA8LMbBvVazi02d76HBBmZpblYxBmZt1g/fr1HHfccQA8++yz9OnTh4aG4svKCxcupH///t2+zSVLlrB27VomTZrU7esGBwSHnXdtrUuoG4v//ZRal2C2wxo6dChLly4FYObMmey+++584QtfqHr51tZW+vTp06VtLlmyhOXLl5cWEN7FZGZWshNOOIHDDjuMcePG8aMf/QiAzZs3M3jwYM4991wOOeQQFi5cyPz58znooIM47LDDOOecc5gyZQoAr7zyCqeddhoTJkxg/Pjx3Hrrrbz66qvMmjWL6667jqamJm666aZur7u0EYSkAcACYNe0nZsi4kJJ1wD/CGxIXU+LiKUqjqZ8B5gMbEztS8qqz8ysp8ydO5chQ4awceNGmpub+ehHP8qgQYPYsGEDRx99NJdeeikbN27kwAMP5Le//S2jRo3i5JNPfnP5WbNmMWnSJK655hpefPFFjjjiCJYtW8YFF1zA8uXLufTSS0upu8wRxGvAsRFxKNAETJJ0ZJp3XkQ0pcfS1PZB4ID0mA5cXmJtZmY9Zvbs2Rx66KG8+93vpqWlhSeffBKA/v37c9JJJwGwcuVKDjroIPbdd18kMW3atDeX/+Uvf8nFF19MU1MT73//+9m0aRNPP/106XWXNoKIiABeSZP90iM6WeRE4Nq03IOSBksaHhHPlFWjmVnZ7rnnHhYsWMCDDz7IwIEDOeqoo978bsLAgQOrOhU1Irj55pvZb7/93tK+YMGCUmpuU+oxCEl9JC0F1gJ3R8Tv06yLJS2TNFvSrqltBLC6YvGW1GZmtsPasGEDQ4YMYeDAgaxYsYKHHnoo22/s2LE8/vjjrF69mojghhtueHPexIkT+d73vvfm9MMPPwzAoEGDePnll0urvdSAiIjWiGgCGoEJkt4JfBl4B3A4MAT4UlfWKWm6pEWSFq1bt67bazYz604f+tCH2LhxI2PHjuWrX/0qRxxxRLbfbrvtxmWXXcbxxx9Pc3MzgwcPZs899wTgwgsv5C9/+QsHH3ww48aNY+bMmQAce+yxPPLII4wfP37HOkhdKSJeknQ/MCkiLknNr0m6Gmg7D2wNMLJiscbU1n5dc4A5AM3NzZ3tsjIzq4m2D3AoLph31113Zfu99NJLb5k+/vjjefzxx4kIzj77bJqbmwF429vexhVXXPE3yzc0NJR607TSRhCSGiQNTs8HAh8A/iBpeGoTMAVYnhaZD5yiwpHABh9/MLPe5PLLL6epqYmxY8fy6quv8ulPf7qm9ZQ5ghgOzJXUhyKIboyI2yTdJ6kBELAU+OfU/3aKU1xXUZzmenqJtZmZ1Z3z
zjuP8847r9ZlvKnMs5iWAeMz7cd20D+AGWXVY2ZmXeNvUpuZWZYDwszMshwQZmaW1euv5mpm1h26+8rQ1Vxd+c477+Szn/0sra2tnHXWWZx//vndWoNHEGZmO6DW1lZmzJjBHXfcwcqVK5k3bx4rV67s1m04IMzMdkALFy5k//335+1vfzv9+/dn6tSp3HLLLd26DQeEmdkOaM2aNYwcueXiE42NjaxZ8zcXn9guDggzM8tyQJiZ7YBGjBjB6tVbLoDd0tLCiBHdewFsB4SZ2Q7o8MMP54knnuCpp57i9ddf5/rrr+fDH/5wt27Dp7mamXWDak5L7U59+/blsssuY+LEibS2tnLGGWcwbty47t1Gt67NzMx6zOTJk5k8eXJp6/cuJjMzy3JAmJlZlgPCzMyyHBBmZpblgDAzsywHhJmZZfk0VzOzbvD0rIO7dX2jLnh0q33OOOMMbrvtNvbee2+WL1/erdsHjyDMzHZYp512GnfeeWdp6y8tICQNkLRQ0iOSVki6KLWPkfR7Sask3SCpf2rfNU2vSvNHl1WbmdnO4Oijj2bIkCGlrb/MEcRrwLERcSjQBEySdCTwDWB2ROwPvAicmfqfCbyY2menfmZmViOlBUQUXkmT/dIjgGOBm1L7XGBKen5imibNP06SyqrPzMw6V+oxCEl9JC0F1gJ3A08CL0XE5tSlBWi7Pu0IYDVAmr8BGFpmfWZm1rFSAyIiWiOiCWgEJgDv2N51SpouaZGkRevWrdvuGs3MLK9HTnONiJck3Q+8GxgsqW8aJTQCbffIWwOMBFok9QX2BNZn1jUHmAPQ3NwcPVG/mdnWVHNaanebNm0aDzzwAM8//zyNjY1cdNFFnHnmmVtfsEqlBYSkBuCNFA4DgQ9QHHi+H/gYcD1wKtB2l+35afp3af59EeEAMDPrwLx580pdf5kjiOHAXEl9KHZl3RgRt0laCVwv6WvAw8CVqf+VwI8lrQJeAKaWWJuZmW1FaQEREcuA8Zn2P1Icj2jfvgn4eFn1mJlZ1/ib1GZm26je94Jvb30OCDOzbTBgwADWr19ftyEREaxfv54BAwZs8zp8sT4zs23Q2NhIS0sL9Xy6/YABA2hsbNzm5R0QZmbboF+/fowZM6bWZZTKu5jMzCzLAWFmZlkOCDMzy3JAmJlZlgPCzMyyHBBmZpblgDAzsywHhJmZZTkgzMwsywFhZmZZDggzM8tyQJiZWZYDwszMshwQZmaW5YAwM7MsB4SZmWWVFhCSRkq6X9JKSSskfTa1z5S0RtLS9JhcscyXJa2S9LikiWXVZmZmW1fmHeU2A5+PiCWSBgGLJd2d5s2OiEsqO0saC0wFxgF/D9wj6cCIaC2xRjMz60BpI4iIeCYilqTnLwOPASM6WeRE4PqIeC0ingJWARPKqs/MzDrXI8cgJI0GxgO/T02fkbRM0lWS9kptI4DVFYu1kAkUSdMlLZK0qJ5vFm5mtqMrPSAk7Q78DDg3Iv4XuBzYD2gCngG+1ZX1RcSciGiOiOaGhoZur9fMzAqlBoSkfhThcF1E/BwgIp6LiNaI+CtwBVt2I60BRlYs3pjazMysBso8i0nAlcBjEfHtivbhFd1OApan5/OBqZJ2lTQGOABYWFZ9ZmbWuTLPYnov8CngUUlLU9tXgGmSmoAA/gScDRARKyTdCKykOANqhs9gMjOrndICIiJ+Aygz6/ZOlrkYuLismszMrHr+JrWZmWU5IMzMLMsBYWZmWQ4IMzPLckCYmVmWA8LMzLIcEGZmluWAMDOzLAeEmZllOSDMzCzLAWFmZlkOCDMzy3JAmJlZlgPCzMyyHBBmZpblgDAzsywHhJmZZVUVEJLurabNzMx2Hp3eclTSAGA3YJikvdhyC9E9gBEl12ZmZjW0tRHE2cBi4B3pZ9vjFuCyzhaUNFLS/ZJWSloh6bOpfYikuyU9kX7uldol6buSVklaJuld2/vizMxs23UaEBHxnYgYA3whIt4eEWPS49CI
6DQggM3A5yNiLHAkMEPSWOB84N6IOAC4N00DfBA4ID2mA5dv+8syM7Pt1ekupjYR8T1J7wFGVy4TEdd2sswzwDPp+cuSHqPYLXUicEzqNhd4APhSar82IgJ4UNJgScPTeszMrIdVFRCSfgzsBywFWlNzAB0GRLvlRwPjgd8D+1R86D8L7JOejwBWVyzWktocEGZmNVBVQADNwNj0132XSNod+BlwbkT8r6Q350VESOrSOiVNp9gFxahRo7pajpmZVana70EsB/6uqyuX1I8iHK6LiJ+n5uckDU/zhwNrU/saYGTF4o2p7S0iYk5ENEdEc0NDQ1dLMjOzKlUbEMOAlZLukjS/7dHZAiqGClcCj0XEtytmzQdOTc9PpTgjqq39lHQ205HABh9/MDOrnWp3Mc3chnW/F/gU8KikpantK8DXgRslnQn8GTg5zbsdmAysAjYCp2/DNs3MrJtUexbTr7q64oj4DVu+WNfecZn+Aczo6nbMzKwc1Z7F9DLFWUsA/YF+wF8iYo+yCjMzs9qqdgQxqO15OrZwIsWX38zMbCfV5au5RuFmYGIJ9ZiZWZ2odhfTRyomd6H4XsSmUioyM7O6UO1ZTCdUPN8M/IliN5OZme2kqj0G4VNOzcx6mWpvGNQo6ReS1qbHzyQ1ll2cmZnVTrW7mK4Gfgp8PE1/MrV9oIyirDaennVwrUuoG6MueLTWJZjVXLVnMTVExNURsTk9rgF8ISQzs51YtQGxXtInJfVJj08C68sszMzMaqvagDiD4ppJz1Lcn+FjwGkl1WRmZnWg2mMQs4BTI+JFKO4rDVxCERxmZrYTqnYEcUhbOABExAsUd4gzM7OdVLUBsYukvdom0gii2tGHmZntgKr9kP8W8DtJ/5mmPw5cXE5JZmZWD6r9JvW1khYBx6amj0TEyvLKMjOzWqt6N1EKBIeCmVkv0eXLfZuZWe/ggDAzsywHhJmZZZUWEJKuSld+XV7RNlPSGklL02NyxbwvS1ol6XFJvludmVmNlTmCuAaYlGmfHRFN6XE7gKSxwFRgXFrm+5L6lFibmZltRWkBERELgBeq7H4icH1EvBYRTwGrgAll1WZmZltXi2MQn5G0LO2Cavt29ghgdUWfltRmZmY10tMBcTmwH9BEcVXYb3V1BZKmS1okadG6deu6uz4zM0t6NCAi4rmIaI2IvwJXsGU30hpgZEXXxtSWW8eciGiOiOaGBt+zyMysLD0aEJKGV0yeBLSd4TQfmCppV0ljgAOAhT1Zm5mZvVVpV2SVNA84BhgmqQW4EDhGUhMQwJ+AswEiYoWkGyku5bEZmBERrWXVZmZmW1daQETEtEzzlZ30vxhfIdbMrG74m9RmZpblm/6Y1anDzru21iXUjcX/fkqtS+iVPIIwM7MsB4SZmWU5IMzMLMsBYWZmWQ4IMzPLckCYmVmWA8LMzLIcEGZmluWAMDOzLAeEmZllOSDMzCzLAWFmZlkOCDMzy3JAmJlZlgPCzMyyHBBmZpblgDAzsywHhJmZZZUWEJKukrRW0vKKtiGS7pb0RPq5V2qXpO9KWiVpmaR3lVWXmZlVp8wRxDXApHZt5wP3RsQBwL1pGuCDwAHpMR24vMS6zMysCqUFREQsAF5o13wiMDc9nwtMqWi/NgoPAoMlDS+rNjMz27qePgaxT0Q8k54/C+yTno8AVlf0a0ltZmZWIzU7SB0RAURXl5M0XdIiSYvWrVtXQmVmZgY9HxDPte06Sj/XpvY1wMiKfo2p7W9ExJyIaI6I5oaGhlKLNTPrzXo6IOYDp6bnpwK3VLSfks5mOhLYULEryszMaqBvWSuWNA84BhgmqQW4EPg6cKOkM4E/Ayen7rcDk4FVwEbg9LLqMjOz6pQWEBExrYNZx2X6BjCjrFrMzKzr/E1qMzPLKm0EYWbWXZ6edXCtS6gboy54tMe25RGEmZllOSDMzCzLAWFmZlkOCDMzy3JAmJlZlgPCzMyyHBBmZpblgDAzsywHhJmZ
ZTkgzMwsywFhZmZZDggzM8tyQJiZWZYDwszMshwQZmaW5YAwM7MsB4SZmWXV5I5ykv4EvAy0ApsjolnSEOAGYDTwJ+DkiHixFvWZmVltRxDvj4imiGhO0+cD90bEAcC9adrMzGqknnYxnQjMTc/nAlNqWIuZWa9Xq4AI4JeSFkuantr2iYhn0vNngX1qU5qZmUGNjkEAR0XEGkl7A3dL+kPlzIgISZFbMAXKdIBRo0aVX6mZWS9VkxFERKxJP9cCvwAmAM9JGg6Qfq7tYNk5EdEcEc0NDQ09VbKZWa/T4wEh6W2SBrU9B/4PsByYD5yaup0K3NLTtZmZ2Ra12MW0D/ALSW3b/2lE3CnpIeBGSWcCfwZOrkFtZmaW9HhARMQfgUMz7euB43q6HjMzy6un01zNzKyOOCDMzCzLAWFmZlkOCDMzy3JAmJlZlgPCzMyyHBBmZpblgDAzsywHhJmZZTkgzMwsywFhZmZZDggzM8tyQJiZWZYDwszMshwQZmaW5YAwM7MsB4SZmWU5IMzMLMsBYWZmWQ4IMzPLqruAkDRJ0uOSVkk6v9b1mJn1VnUVEJL6AP8BfBAYC0yTNLa2VZmZ9U51FRDABGBVRPwxIl4HrgdOrHFNZma9Ur0FxAhgdcV0S2ozM7Me1rfWBXSVpOnA9DT5iqTHa1nPzmRfGAY8X+s66sKFqnUFVsH/Nit0z7/NfavpVG8BsQYYWTHdmNreFBFzgDk9WVRvIWlRRDTXug6z9vxvszbqbRfTQ8ABksZI6g9MBebXuCYzs16prkYQEbFZ0meAu4A+wFURsaLGZZmZ9Up1FRAAEXE7cHut6+ilvOvO6pX/bdaAIqLWNZiZWR2qt2MQZmZWJxwQ5subWN2SdJWktZKW17qW3sgB0cv58iZW564BJtW6iN7KAWG+vInVrYhYALxQ6zp6KweE+fImZpblgDAzsywHhG318iZm1js5IMyXNzGzLAdELxcRm4G2y5s8Btzoy5tYvZA0D/gdcJCkFkln1rqm3sTfpDYzsyyPIMzMLMsBYWZmWQ4IMzPLckCYmVmWA8LMzLIcEGZmluWA6GUkDZW0ND2elbSmYrp/u753SRpUq1rbk/QbSU2Z9m2qU9JYSY9IeljS6A769JX0UgfzfiJpSifr/5ykAVWsZ4akT3SynuMl3dzZa+kJks6SFJKOqWj7WGqbkqavlnRQF9d7kqTzurlc6wZ1d8tRK1dErAeaACTNBF6JiEsq+0gSxXdkJvZ8hV23HXV+BJgXEV/vznoqfA64CtjUWaeI+I+Stl+GRym+bf9Amp4GPNI2MyJO7+oKI+IX3VKZdTuPIAwASftLWinpOmAFMDx9c3Vwmn+6pGXpL+6rU9s+kn4uaZGkhZKOTO1fkzRX0oOSnpB0RmofkUYBSyUtl/SeDmrpK+nHkh5N/f6l3fw+6a/3mWm6RdLg9BqWS7pS0gpJd7T9BZ/ZxocpvkF+jqR7UtsX0/LLJZ2TWWYXSd+X9AdJdwPDOnk//xXYG/h12/pT+9fTe/g7SXtXvF/npucHSrov9VnSfmQj6YjUPiYtd6WkX0n6o6QZFf1OTb+TpanmXTp6XyX9a/rdL5P0k45eU/IA8J60rj2AUcCbN/NpG+V1ZVtpZHJpev4TSd+R9N/pNZ2U2vtI+kF6738p6U51Mnqz7uERhFV6B3BKRCwCKAYSIOlQ4EvAeyLiBUlDUv/vAt+MiAfTB9ltwDvTvIOB9wB7AEsk/RfwSeDWiPiGihsVDeygjsOAYRFxcNr+4Ip5/YB5wOKI+EZm2YOAaRHxqKSfA1Mo7nHxFhExX9IE4PmIuFTSEcAngMMp/l8slPQAxeVH2nwMGENxY6W/B1YCP8i9gIiYLenzwPsi4iVJfYE9gV9FxPmSvg2cAbQfvcwDZkbErSncdgH2T+/D+4DZwIcjoiX9fg4EjgMGA49J+gHwD8BJFL+vzZLmUPzV/2QH7+sXgX0j4vV273XOXylC4nhgH+Dm
tL32OvodVrOtvYH3UvwbuhH4BfBxisvQjwX+juL3kn3vrft4BGGVnmwLh3aOBW6IiBcA2n5SfEj8QNJSig+KvSS1fejfHBGbImItsIDig/ch4CxJFwLvjIhXOqhjFcW1d74raSKwoWLej+g4HKC4+dGj6fliYPRWXnObo4CfRcSrEfFyej3va9fnaIpdUn+NiBa27Gap1qsRcUdHtUnai+JD9VaA9P5tTLPfCXwf+L9p221ui4jX0/v8AtBA8Xs5HFiUfjf/COxHx+/rCuAnKo6DvFHF67ieInCmkgnfZHu2dXMUlrHl3iRHUVwn7K8R8T/Ar6qo07aTA8Iq/aWL/QVMiIim9BgREa+mee0v8hURcR9wDPAMcK06ODCbjpMcAvwamAH8sGL2fwPHSdq1g5peq3jeSn2Nkl+veN7V2v6H4gP10Hbtudcr4KqK38tBEfFvnbyvEyn+Gj+cYuTUZyu1/I5ihLBHRDyZ67Cd26p8TdpKLVYiB4RV4z7gn9p2LVXsYrqH4j8/qb3yDKMpknaV1EDxl/giSfsCz0bEHOBqYHxuY2kZRcR/AhcA76qY/cO03evTbpvu8mvgJEkDJe1OcdvVX7frs4DifdhF0giKv8w78zJQ9dlVEfEisE7SCQCSBkjaLc1+AfgQcEna1dSZe4CTJQ1L6xkqaVTufU0f0I0pvL9IcVxlt45WnOoM4HzgKx316a5tVfgt8DEVhlOM5qxk9fTXldWpiHhE0jeBBZI2U+weOZMiHC6XdDrFv6X72RIYyyl2AwwFLoyI51QcrP6cpDcoPjw/1cEmRwJXqtjJHhTHPyrr+aaki4FrJJ3STa9xoYpLSz+Umi5PxzEq/4/cBLyf4tjD0xR/SXdmDnCPpNXApCpL+QTww/T6Xgc+WlHjMyk8bu/sdae6L0rb3oVi5PHPFCOM9u9rX+CnKk4T3gW4JO1i61RE/NdWuuR+h9lttR3r2oobKXZ1Pgb8GXiYt+56tBL4ct/W7SR9jXTwt9a12M5D0u4R8UoanfweOCIi1tW6rp2ZRxBmtqO4I51a249iVOpwKJlHEFZTkhbxt3+o/L+IWNmN2/gBcGS75m9HxLXdtP75FN8HqPSFiLgn17/eSTqL4jsilRZExL/k+tvOywFhZmZZPovJzMyyHBBmZpblgDAzsywHhJmZZTkgzMws6/8D7ATRhf7U68UAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "# Count rows per Target class, split by whether the skin-fold measurement is missing.\n",
    "sns.countplot(x=\"Triceps_skin_fold_thickness_Missing\", hue=\"Target\", data=train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x11511c550>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAELCAYAAADDZxFQAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAFUlJREFUeJzt3XuQXOV55/HvYyFFXLQICUGwBpDAQCIZENYA9oZQNrBBlmMutqNFldhgMMIb7IXdQIV4vSCropS3fAEbNmwBJohdzCW2QZhVca0QVVzGQmAuQoSADUajBSRkLANCgMbP/tFnRCNeST3DnOke5vup6upz3n7fc57umppfn2tHZiJJ0pbe1+4CJEmdyYCQJBUZEJKkIgNCklRkQEiSigwISVKRASFJKjIgJElFBoQkqWiHdhfwbuy+++45ZcqUdpchScPKAw888GJmTtpev2EdEFOmTGH58uXtLkOShpWI+FUr/dzFJEkqMiAkSUUGhCSpaFgfg5CkdnnzzTfp6elh48aN7S5lq8aOHUtXVxejR48e0HgDQpIGoKenh3HjxjFlyhQiot3lvENmsm7dOnp6epg6deqAluEuJkkagI0bNzJx4sSODAeAiGDixInvagvHgJCkAerUcOjzbuszICRJRR6DkKRBsG7dOo499lgAnn/+eUaNGsWkSY2LlZctW8aYMWMGfZ0PPvgga9asYdasWYO+bDAgmHn+te0uoWM88I3PtbsEadiaOHEiDz30EADz589nl1124bzzzmt5fG9vL6NGjerXOh988EFWrFhRW0C4i0mSavbJT36SmTNnMn36dK666ioANm3axPjx4zn33HM55JBDWLZsGbfeeisHHXQQM2fO5Mtf/jInnXQSAK+88gqnnXYaRxxxBIcddhg//vGPee2111iwYAHXXXcdM2bM4Ac/+MGg1z3ityAkqW6LFi1iwoQJbNiwge7ubj796U8zbtw41q9fz9FHH80ll1zChg0bOPDAA/nJT37CPvvsw5w5czaPX7BgAbNmzeKaa67hpZde4sgjj+SRRx7hwgsvZMWKFVxyySW11O0WhCTV7OKLL+bQQw/lIx/5CD09PfziF78AYMyYMZx88skArFy5koMOOoh9992XiGDu3Lmbx995550sXLiQGTNm8LGPfYyNGzfy7LPP1l63WxCSVKO7776bpUuXct9997Hjjjty1FFHbb42Yccdd2zpVNTM5JZbbmH//fd/W/vSpUtrqbmPWxCSVKP169czYcIEdtxxRx577DHuv//+Yr9p06bxxBNPsGrVKjKTG2+8cfNrxx9/PJdeeunm+Z///OcAjBs3jpdffrm22g0ISarRJz7xCTZs2MC0adP46le/ypFHHlnst9NOO3HZZZdx3HHH0d3dzfjx49l1110BuOiii3j11Vc5+OCDmT59OvPnzwfgmGOO4eGHH+awww4bXgepI2Jv4FpgTyCBKzLzOxExHzgTWFt1/UpmLqnG/A1wBtAL/OfMvKOu+iSpLn3/wKFxw7w77ij/K/vNb37ztvnjjjuOJ554gszkrLPOoru7G4Cdd96ZK6+88h3jJ02aVOuPptV5DGIT8FeZ+WBEjAMeiIi7qtcuzsxvNneOiGnAKcB04P3A3RFxYGb21lijJHWMyy+/nOuuu47XX3+d7u5uzjzzzLbWU1tAZOZzwHPV9MsR8TgweRtDTgRuyMzXgacj4ingCOCnddUoSZ3k/PPP5/zzz293GZsNyTGIiJgCHAb8rGr6UkQ8EhFXR8RuVdtkYFXTsB62HSiSpBrVHhARsQvwQ+DczPwtcDmwPzCDxhbGt/q5vHkRsTwilq9du3b7AyRJA1JrQETEaBrhcF1m/gggM1/IzN7M/B1wJY3dSACrgb2bhndVbW+TmVdkZndmdvfdCEuSNPhqC4hoXP3xPeDxzPx2U/teTd1OBlZU07cCp0TE70XEVOAA
YFld9UmStq3Os5j+CPgs8GhEPFS1fQWYGxEzaJz6+gxwFkBmPhYRNwEraZwBdbZnMEkaLgb7ztCt3F359ttv55xzzqG3t5cvfOELXHDBBYNaQ51nMf0LULqGfMk2xiwEFtZVkyS9V/T29nL22Wdz11130dXVxeGHH84JJ5zAtGnTBm0dXkktScPQsmXL+MAHPsB+++3HmDFjOOWUU1i8ePGgrsOAkKRhaPXq1ey991vn9XR1dbF69TvO63lXDAhJUpEBIUnD0OTJk1m16q1ri3t6epg8eXCvLTYgJGkYOvzww3nyySd5+umneeONN7jhhhs44YQTBnUd/mCQJA2CVk5LHUw77LADl112Gccffzy9vb2cfvrpTJ8+fXDXMahLkzRoBvu8+uFsqP/5DhezZ89m9uzZtS3fXUySpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRZ7mKkmD4NkFBw/q8va58NHt9jn99NO57bbb2GOPPVixYsV2+/eXWxCSNEyddtpp3H777bUt34CQpGHq6KOPZsKECbUt34CQJBUZEJKkIgNCklRkQEiSijzNVZIGQSunpQ62uXPncu+99/Liiy/S1dXF1772Nc4444xBW74BIUnD1PXXX1/r8t3FJEkqMiAkSUUGhCQNUGa2u4Rterf1GRCSNABjx45l3bp1HRsSmcm6desYO3bsgJfhQWpJGoCuri56enpYu3Ztu0vZqrFjx9LV1TXg8QaEJA3A6NGjmTp1arvLqJW7mCRJRQaEJKmotoCIiL0j4p8iYmVEPBYR51TtEyLiroh4snrerWqPiPhuRDwVEY9ExIfqqk2StH11bkFsAv4qM6cBHwbOjohpwAXAPZl5AHBPNQ/wceCA6jEPuLzG2iRJ21FbQGTmc5n5YDX9MvA4MBk4EVhUdVsEnFRNnwhcmw33AeMjYq+66pMkbduQHIOIiCnAYcDPgD0z87nqpeeBPavpycCqpmE9VZskqQ1qD4iI2AX4IXBuZv62+bVsXGHSr6tMImJeRCyPiOWdfP6xJA13tQZERIymEQ7XZeaPquYX+nYdVc9rqvbVwN5Nw7uqtrfJzCsyszszuydNmlRf8ZI0wtV5FlMA3wMez8xvN710K3BqNX0qsLip/XPV2UwfBtY37YqSJA2xOq+k/iPgs8CjEfFQ1fYV4OvATRFxBvArYE712hJgNvAUsAH4fI21SZK2o7aAyMx/AWIrLx9b6J/A2XXVI0nqH6+kliQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRQaEJKnIgJAkFRkQkqQiA0KSVGRASJKKDAhJUpEBIUkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklRkQEiSigwISVKRASFJKjIgJElFBoQkqciAkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSqqLSAi4uqIWBMRK5ra5kfE6oh4qHrMbnrtbyLiqYh4IiKOr6suSVJrWgqIiLinlbYtXAPMKrRfnJkzqseSalnTgFOA6dWYv4+IUa3UJkmqxzYDIiLGRsQEYPeI2C0iJlSPKcDkbY3NzKXAr1us40Tghsx8PTOfBp4CjmhxrCSpBtvbgjgLeAD4g+q577EYuGyA6/xSRDxS7YLarWqbDKxq6tPDdgJIklSvbQZEZn4nM6cC52Xmfpk5tXocmpkDCYjLgf2BGcBzwLf6u4CImBcRyyNi+dq1awdQgiSpFTu00ikzL42Ifw9MaR6Tmdf2Z2WZ+ULfdERcCdxWza4G9m7q2lW1lZZxBXAFQHd3d/Zn/ZKk1rUUEBHxv2l8838I6K2aE+hXQETEXpn5XDV7MtB3htOtwPcj4tvA+4EDgGX9WbYkaXC1FBBANzAtM1v+xh4R1wMfpXGAuwe4CPhoRMygES7P0DjGQWY+FhE3ASuBTcDZmdlbWq4kaWi0GhArgN+ncdygJZk5t9D8vW30XwgsbHX5kqR6tRoQuwMrI2IZ8HpfY2aeUEtVkqS2azUg
5tdZhCSp87R6FtM/112IJKmztHoW08s0DiwDjAFGA69m5r+rqzBJUnu1ugUxrm86IoLGrTE+XFdRkqT2a/UYxGbVqa63RMRFwAWDX5Ikvd2zCw5udwkdY58LHx2ydbW6i+lTTbPvo3FdxMZaKpIkdYRWtyA+2TS9icZFbicOejWSpI7R6jGIz9ddiCSps7T6g0FdEXFz9QtxayLihxHRVXdxkqT2afUnR/+Bxg313l89fly1SZLeo1o9BjEpM5sD4ZqIOLeOgtQ+ninylqE8U0TqVK1uQayLiL+IiFHV4y+AdXUWJklqr1YD4nRgDvA8jTu6fgY4raaaJEkdoNVdTAuAUzPzJYCImAB8k0ZwSJLeg1rdgjikLxwAMvPXwGH1lCRJ6gStBsT7ImK3vplqC6Lft+mQJA0frf6T/xbw04j4x2r+z/DX3yTpPa3VK6mvjYjlwDFV06cyc2V9ZUmS2q3l3URVIBgKkjRCtHoMQpI0whgQkqQiA0KSVGRASJKKDAhJUpEBIUkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklRUW0BExNURsSYiVjS1TYiIuyLiyep5t6o9IuK7EfFURDwSER+qqy5JUmvq3IK4Bpi1RdsFwD2ZeQBwTzUP8HHggOoxD7i8xrokSS2oLSAycynw6y2aTwQWVdOLgJOa2q/NhvuA8RGxV121SZK2b6iPQeyZmc9V088De1bTk4FVTf16qrZ3iIh5EbE8IpavXbu2vkolaYRr20HqzEwgBzDuiszszszuSZMm1VCZJAmGPiBe6Nt1VD2vqdpXA3s39euq2iRJbTLUAXErcGo1fSqwuKn9c9XZTB8G1jftipIktUHLPznaXxFxPfBRYPeI6AEuAr4O3BQRZwC/AuZU3ZcAs4GngA3A5+uqS5LUmtoCIjPnbuWlYwt9Ezi7rlokSf3nldSSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRQaEJKnIgJAkFRkQkqQiA0KSVGRASJKKDAhJUpEBIUkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklRkQEiSigwISVKRASFJKjIgJElFBoQkqciAkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRTu0Y6UR8QzwMtALbMrM7oiYANwITAGeAeZk5kvtqE+S1N4tiI9l5ozM7K7mLwDuycwDgHuqeUlSm3TSLqYTgUXV9CLgpDbWIkkjXrsCIoE7I+KBiJhXte2Zmc9V088De7anNEkStOkYBHBUZq6OiD2AuyLiX5tfzMyMiCwNrAJlHsA+++xTf6WSNEK1ZQsiM1dXz2uAm4EjgBciYi+A6nnNVsZekZndmdk9adKkoSpZkkacIQ+IiNg5Isb1TQN/AqwAbgVOrbqdCiwe6tokSW9pxy6mPYGbI6Jv/d/PzNsj4n7gpog4A/gVMKcNtUmSKkMeEJn5S+DQQvs64NihrkeSVNZJp7lKkjqIASFJKjIgJElFBoQkqciAkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRQaEJKnIgJAkFRkQkqQiA0KSVGRASJKKDAhJUpEBIUkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklRkQEiSigwISVKRASFJKjIgJElFBoQkqciAkCQVGRCSpKKOC4iImBURT0TEUxFxQbvrkaSRqqMCIiJGAf8T+DgwDZgbEdPaW5UkjUwdFRDAEcBTmfnLzHwDuAE4sc01SdKI1GkBMRlY1TTfU7VJkobYDu0uoL8iYh4wr5p9JSKeaGc97yX7wu7Ai+2uoyNcFO2uQE3822wyOH+b+7bSqdMCYjWwd9N8V9W2WWZeAVwxlEWNFBGxPDO7212HtCX/Ntuj03Yx3Q8cEBFTI2IMcApwa5trkqQRqaO2IDJzU0R8CbgDGAVcnZmPtbksSRqROiogADJzCbCk3XWMUO66U6fyb7MNIjPbXYMkqQN12jEISVKHMCDk7U3UsSLi6ohYExEr
2l3LSGRAjHDe3kQd7hpgVruLGKkMCHl7E3WszFwK/LrddYxUBoS8vYmkIgNCklRkQGi7tzeRNDIZEPL2JpKKDIgRLjM3AX23N3kcuMnbm6hTRMT1wE+BgyKiJyLOaHdNI4lXUkuSityCkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoyIKRKRHwxIj43yMu8JiI+U01fNZA75UbE/IjIiPhAU9u5VVt3Nb8kIsb3c7mD/n713tJxPzkqbU9E7FBd4DeoMvN/DfYyt1j+F97F8EdpXOX+t9X8nwGbL2jMzNkDqKfW96vhzy0ItU1E7BwR/zciHo6IFRHxHyNiZkT8c0Q8EBF3RMReVd97I+KSiFgOnNP8zbx6/ZXq+aPV+MUR8cuI+HpE/HlELIuIRyNi/23UMz8izmta3/+oxv1bRPxx1T69ansoIh6JiAMiYkrzD9pExHkRMb+w/HubvvG/EhELq/d+X0TsuZ2P6xaq27BX72E98GLTsp+JiN1Ln2n1+tcjYmVV8zf78X53ioibqrE3R8TP+t6D3vsMCLXTLOD/ZeahmflB4HbgUuAzmTkTuBpY2NR/TGZ2Z+a3trPcQ4EvAn8IfBY4MDOPAK4CvtyP+naoxp0LXFS1fRH4TmbOALpp3B59IHYG7svMQ4GlwJnb6f9bYFVEfJDGlsSNW+n3js80IiYCJwPTM/MQ3toK2VLp/f4l8FJmTgP+OzCztben9wIDQu30KPAfqm+uf0zjrrIfBO6KiIeAr9K4u2yfrf1T3NL9mflcZr4O/AK4s2l9U/pR34+q5weaxv0U+EpE/DWwb2a+1o/lNXsDuK2w/G25gUY4nATcvJU+b/tMM3M9ja2NjcD3IuJTwIatjC2936Oq9ZKZK4BHWqhT7xEGhNomM/8N+BCNf2p/C3waeCwzZ1SPgzPzT5qGvNo0vYnq7zci3geMaXrt9abp3zXN/47+HXfrG9fbNy4zvw+cALwGLImIY5prqYxtYdlv5ls3Qtu8/O24jcYW0bOZ+dtShy0/04i4sDpecwTwA+BPaWyplbzj/WpkMyDUNhHxfmBDZv4f4BvAkcCkiPhI9froiJi+leHP8NbujhOA0TWXS1XTfsAvM/O7wGLgEOAFYI+ImBgRv0fjn/Cgy8wNwF/z9t1uW9a35Wf6oYjYBdg1M5cA/4XGLrhW/QSYUy17GnDwAMvXMOS3BLXTwcA3IuJ3wJvAf6Lxbfy7EbErjb/PS2g6W6fJlcDiiHiYxjfiVwt96jAH+GxEvAk8D/xdZr4ZEQuAZTR+bOlf61p5Zt6wnS6lz3Qcjc9qLBDAf+3HKv8eWBQRK2m8r8do7LLSCODtviVtVUSMAkZn5sbq7Km7gYMy8402l6Yh4BaEpG3ZCfiniBhNY+vjLw2HkcMtCI04EfHfaFxo1uwfM3Or+/aHQqfWpZHLgJAkFXkWkySpyICQJBUZEJKkIgNCklRkQEiSiv4/bAC/xG+EblgAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "train['serum_insulin_Missing'] = train['serum_insulin'].apply(lambda x: 1 if pd.isnull(x) else 0)\n",
    "sns.countplot(x=\"serum_insulin_Missing\", hue=\"Target\",data=train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.drop([\"Triceps_skin_fold_thickness_Missing\", \"serum_insulin_Missing\"], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pregnants                       0\n",
      "Plasma_glucose_concentration    0\n",
      "blood_pressure                  0\n",
      "Triceps_skin_fold_thickness     0\n",
      "serum_insulin                   0\n",
      "BMI                             0\n",
      "Diabetes_pedigree_function      0\n",
      "Age                             0\n",
      "Target                          0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "medians = train.median() \n",
    "train = train.fillna(medians)\n",
    "\n",
    "print(train.isnull().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv(\"pima-indians-diabetes.csv\")\n",
    "\n",
    "y = train['Target']   \n",
    "X_train_org = train.drop([\"Target\"], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "columns_org = X_train_org.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/ys/anaconda3/lib/python2.7/site-packages/sklearn/preprocessing/data.py:625: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by StandardScaler.\n",
      "  return self.partial_fit(X, y)\n",
      "/Users/ys/anaconda3/lib/python2.7/site-packages/sklearn/base.py:462: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by StandardScaler.\n",
      "  return self.fit(X, **fit_params).transform(X)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.639947</td>\n",
       "      <td>0.848324</td>\n",
       "      <td>0.149641</td>\n",
       "      <td>0.907270</td>\n",
       "      <td>-0.692891</td>\n",
       "      <td>0.204013</td>\n",
       "      <td>0.468492</td>\n",
       "      <td>1.425995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-1.123396</td>\n",
       "      <td>-0.160546</td>\n",
       "      <td>0.530902</td>\n",
       "      <td>-0.692891</td>\n",
       "      <td>-0.684422</td>\n",
       "      <td>-0.365061</td>\n",
       "      <td>-0.190672</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.233880</td>\n",
       "      <td>1.943724</td>\n",
       "      <td>-0.263941</td>\n",
       "      <td>-1.288212</td>\n",
       "      <td>-0.692891</td>\n",
       "      <td>-1.103255</td>\n",
       "      <td>0.604397</td>\n",
       "      <td>-0.105584</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-0.998208</td>\n",
       "      <td>-0.160546</td>\n",
       "      <td>0.154533</td>\n",
       "      <td>0.123302</td>\n",
       "      <td>-0.494043</td>\n",
       "      <td>-0.920763</td>\n",
       "      <td>-1.041549</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-1.141852</td>\n",
       "      <td>0.504055</td>\n",
       "      <td>-1.504687</td>\n",
       "      <td>0.907270</td>\n",
       "      <td>0.765836</td>\n",
       "      <td>1.409746</td>\n",
       "      <td>5.484909</td>\n",
       "      <td>-0.020496</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "0   0.639947                      0.848324        0.149641   \n",
       "1  -0.844885                     -1.123396       -0.160546   \n",
       "2   1.233880                      1.943724       -0.263941   \n",
       "3  -0.844885                     -0.998208       -0.160546   \n",
       "4  -1.141852                      0.504055       -1.504687   \n",
       "\n",
       "   Triceps_skin_fold_thickness  serum_insulin       BMI  \\\n",
       "0                     0.907270      -0.692891  0.204013   \n",
       "1                     0.530902      -0.692891 -0.684422   \n",
       "2                    -1.288212      -0.692891 -1.103255   \n",
       "3                     0.154533       0.123302 -0.494043   \n",
       "4                     0.907270       0.765836  1.409746   \n",
       "\n",
       "   Diabetes_pedigree_function       Age  \n",
       "0                    0.468492  1.425995  \n",
       "1                   -0.365061 -0.190672  \n",
       "2                    0.604397 -0.105584  \n",
       "3                   -0.920763 -1.041549  \n",
       "4                    5.484909 -0.020496  "
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 数据标准化\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "ss_org = StandardScaler()\n",
    "X_train_org = ss_org.fit_transform(X_train_org)\n",
    "\n",
    "X_train_org = pd.DataFrame(columns = columns_org, data = X_train_org)\n",
    "X_train_org.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_org = pd.concat([X_train_org, y], axis = 1)\n",
    "train_org.to_csv('./data/FE_diabetes-org.csv',index = False,header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/ys/anaconda3/lib/python2.7/site-packages/ipykernel_launcher.py:2: RuntimeWarning: invalid value encountered in log1p\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "# LOG\n",
    "X_train_log = np.log1p(X_train_org)\n",
    "columns_log = columns_org + \"_log\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants_log</th>\n",
       "      <th>Plasma_glucose_concentration_log</th>\n",
       "      <th>blood_pressure_log</th>\n",
       "      <th>Triceps_skin_fold_thickness_log</th>\n",
       "      <th>serum_insulin_log</th>\n",
       "      <th>BMI_log</th>\n",
       "      <th>Diabetes_pedigree_function_log</th>\n",
       "      <th>Age_log</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.726987</td>\n",
       "      <td>0.723606</td>\n",
       "      <td>0.185978</td>\n",
       "      <td>0.641268</td>\n",
       "      <td>-0.921785</td>\n",
       "      <td>0.313496</td>\n",
       "      <td>0.683023</td>\n",
       "      <td>1.090723</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-1.611777</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.192353</td>\n",
       "      <td>0.201926</td>\n",
       "      <td>-0.921785</td>\n",
       "      <td>-1.048172</td>\n",
       "      <td>0.003389</td>\n",
       "      <td>0.218194</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.033509</td>\n",
       "      <td>1.139087</td>\n",
       "      <td>-0.350496</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.921785</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.754768</td>\n",
       "      <td>0.297648</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-1.611777</td>\n",
       "      <td>-5.470782</td>\n",
       "      <td>-0.192353</td>\n",
       "      <td>-0.362024</td>\n",
       "      <td>0.593035</td>\n",
       "      <td>-0.568138</td>\n",
       "      <td>-1.683477</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.539599</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.641268</td>\n",
       "      <td>1.121428</td>\n",
       "      <td>1.019098</td>\n",
       "      <td>1.886918</td>\n",
       "      <td>0.369875</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants_log  Plasma_glucose_concentration_log  blood_pressure_log  \\\n",
       "0       0.726987                          0.723606            0.185978   \n",
       "1      -1.611777                               NaN           -0.192353   \n",
       "2       1.033509                          1.139087           -0.350496   \n",
       "3      -1.611777                         -5.470782           -0.192353   \n",
       "4            NaN                          0.539599                 NaN   \n",
       "\n",
       "   Triceps_skin_fold_thickness_log  serum_insulin_log   BMI_log  \\\n",
       "0                         0.641268          -0.921785  0.313496   \n",
       "1                         0.201926          -0.921785 -1.048172   \n",
       "2                              NaN          -0.921785       NaN   \n",
       "3                        -0.362024           0.593035 -0.568138   \n",
       "4                         0.641268           1.121428  1.019098   \n",
       "\n",
       "   Diabetes_pedigree_function_log   Age_log  \n",
       "0                        0.683023  1.090723  \n",
       "1                        0.003389  0.218194  \n",
       "2                        0.754768  0.297648  \n",
       "3                       -1.683477       NaN  \n",
       "4                        1.886918  0.369875  "
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "ss_log = StandardScaler()\n",
    "X_train_log = ss_log.fit_transform(X_train_log)\n",
    "\n",
    "X_train_log = pd.DataFrame(columns = columns_log, data = X_train_log)\n",
    "X_train_log.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_log = pd.concat([X_train_log, y], axis = 1)\n",
    "train_log.to_csv('./data/FE_diabetes-log.csv',index = False,header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "# transform counts to TFIDF features\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "tfidf = TfidfTransformer()\n",
    "\n",
    "X_train_tfidf = tfidf.fit_transform(X_train_org).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants_tfidf</th>\n",
       "      <th>Plasma_glucose_concentration_tfidf</th>\n",
       "      <th>blood_pressure_tfidf</th>\n",
       "      <th>Triceps_skin_fold_thickness_tfidf</th>\n",
       "      <th>serum_insulin_tfidf</th>\n",
       "      <th>BMI_tfidf</th>\n",
       "      <th>Diabetes_pedigree_function_tfidf</th>\n",
       "      <th>Age_tfidf</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.293647</td>\n",
       "      <td>0.389263</td>\n",
       "      <td>0.068664</td>\n",
       "      <td>0.416311</td>\n",
       "      <td>-0.317941</td>\n",
       "      <td>0.093614</td>\n",
       "      <td>0.214973</td>\n",
       "      <td>0.654334</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.458093</td>\n",
       "      <td>-0.609101</td>\n",
       "      <td>-0.087047</td>\n",
       "      <td>0.287852</td>\n",
       "      <td>-0.375682</td>\n",
       "      <td>-0.371091</td>\n",
       "      <td>-0.197934</td>\n",
       "      <td>-0.103381</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.408951</td>\n",
       "      <td>0.644218</td>\n",
       "      <td>-0.087479</td>\n",
       "      <td>-0.426959</td>\n",
       "      <td>-0.229648</td>\n",
       "      <td>-0.365657</td>\n",
       "      <td>0.200318</td>\n",
       "      <td>-0.034994</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.425010</td>\n",
       "      <td>-0.502137</td>\n",
       "      <td>-0.080761</td>\n",
       "      <td>0.077736</td>\n",
       "      <td>0.062026</td>\n",
       "      <td>-0.248523</td>\n",
       "      <td>-0.463179</td>\n",
       "      <td>-0.523940</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.186954</td>\n",
       "      <td>0.082528</td>\n",
       "      <td>-0.246360</td>\n",
       "      <td>0.148546</td>\n",
       "      <td>0.125389</td>\n",
       "      <td>0.230816</td>\n",
       "      <td>0.898036</td>\n",
       "      <td>-0.003356</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants_tfidf  Plasma_glucose_concentration_tfidf  blood_pressure_tfidf  \\\n",
       "0         0.293647                            0.389263              0.068664   \n",
       "1        -0.458093                           -0.609101             -0.087047   \n",
       "2         0.408951                            0.644218             -0.087479   \n",
       "3        -0.425010                           -0.502137             -0.080761   \n",
       "4        -0.186954                            0.082528             -0.246360   \n",
       "\n",
       "   Triceps_skin_fold_thickness_tfidf  serum_insulin_tfidf  BMI_tfidf  \\\n",
       "0                           0.416311            -0.317941   0.093614   \n",
       "1                           0.287852            -0.375682  -0.371091   \n",
       "2                          -0.426959            -0.229648  -0.365657   \n",
       "3                           0.077736             0.062026  -0.248523   \n",
       "4                           0.148546             0.125389   0.230816   \n",
       "\n",
       "   Diabetes_pedigree_function_tfidf  Age_tfidf  \n",
       "0                          0.214973   0.654334  \n",
       "1                         -0.197934  -0.103381  \n",
       "2                          0.200318  -0.034994  \n",
       "3                         -0.463179  -0.523940  \n",
       "4                          0.898036  -0.003356  "
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "columns_tfidf = columns_org + \"_tfidf\"\n",
    "X_train_tfidf = pd.DataFrame(columns = columns_tfidf, data = X_train_tfidf)\n",
    "X_train_tfidf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "feat_names_tfidf = X_train_tfidf.columns\n",
    "ms_tfidf = StandardScaler()\n",
    "X_train_tfidf = ms_tfidf.fit_transform(X_train_tfidf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants_tfidf</th>\n",
       "      <th>Plasma_glucose_concentration_tfidf</th>\n",
       "      <th>blood_pressure_tfidf</th>\n",
       "      <th>Triceps_skin_fold_thickness_tfidf</th>\n",
       "      <th>serum_insulin_tfidf</th>\n",
       "      <th>BMI_tfidf</th>\n",
       "      <th>Diabetes_pedigree_function_tfidf</th>\n",
       "      <th>Age_tfidf</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.866566</td>\n",
       "      <td>1.147929</td>\n",
       "      <td>0.175537</td>\n",
       "      <td>1.054853</td>\n",
       "      <td>-0.908318</td>\n",
       "      <td>0.317545</td>\n",
       "      <td>0.724776</td>\n",
       "      <td>1.908624</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-1.151198</td>\n",
       "      <td>-1.552080</td>\n",
       "      <td>-0.336402</td>\n",
       "      <td>0.722680</td>\n",
       "      <td>-1.084406</td>\n",
       "      <td>-1.054618</td>\n",
       "      <td>-0.480251</td>\n",
       "      <td>-0.144743</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.176056</td>\n",
       "      <td>1.837438</td>\n",
       "      <td>-0.337822</td>\n",
       "      <td>-1.125703</td>\n",
       "      <td>-0.639062</td>\n",
       "      <td>-1.038576</td>\n",
       "      <td>0.682009</td>\n",
       "      <td>0.040582</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-1.062399</td>\n",
       "      <td>-1.262805</td>\n",
       "      <td>-0.315733</td>\n",
       "      <td>0.179355</td>\n",
       "      <td>0.250423</td>\n",
       "      <td>-0.692704</td>\n",
       "      <td>-1.254342</td>\n",
       "      <td>-1.284434</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.423426</td>\n",
       "      <td>0.318385</td>\n",
       "      <td>-0.860181</td>\n",
       "      <td>0.362457</td>\n",
       "      <td>0.443656</td>\n",
       "      <td>0.722671</td>\n",
       "      <td>2.718226</td>\n",
       "      <td>0.126321</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants_tfidf  Plasma_glucose_concentration_tfidf  blood_pressure_tfidf  \\\n",
       "0         0.866566                            1.147929              0.175537   \n",
       "1        -1.151198                           -1.552080             -0.336402   \n",
       "2         1.176056                            1.837438             -0.337822   \n",
       "3        -1.062399                           -1.262805             -0.315733   \n",
       "4        -0.423426                            0.318385             -0.860181   \n",
       "\n",
       "   Triceps_skin_fold_thickness_tfidf  serum_insulin_tfidf  BMI_tfidf  \\\n",
       "0                           1.054853            -0.908318   0.317545   \n",
       "1                           0.722680            -1.084406  -1.054618   \n",
       "2                          -1.125703            -0.639062  -1.038576   \n",
       "3                           0.179355             0.250423  -0.692704   \n",
       "4                           0.362457             0.443656   0.722671   \n",
       "\n",
       "   Diabetes_pedigree_function_tfidf  Age_tfidf  \n",
       "0                          0.724776   1.908624  \n",
       "1                         -0.480251  -0.144743  \n",
       "2                          0.682009   0.040582  \n",
       "3                         -1.254342  -1.284434  \n",
       "4                          2.718226   0.126321  "
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_tfidf = pd.DataFrame(columns = columns_tfidf, data = X_train_tfidf)\n",
    "X_train_tfidf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_tfidf = pd.concat([X_train_tfidf, y], axis = 1)\n",
    "train_tfidf.to_csv('./data/FE_diabetes-tfidf.csv',index=False,header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cPickle\n",
    "\n",
    "cPickle.dump(ss_org, open(\"ss_org.pkl\", 'wb'))\n",
    "cPickle.dump(ss_log, open(\"ss_log.pkl\", 'wb'))\n",
    "cPickle.dump(ss_log, open(\"ss_tfdif.pkl\", 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
